{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import copy\n",
    "import pandas\n",
    "import numpy\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.grid_search import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class ReviewSentiment:\n",
    "    \"\"\"Fit, tune and evaluate a TF-IDF + linear classifier on labelled review text.\n",
    "\n",
    "    Attributes:\n",
    "        data: table indexed by column name (a pandas DataFrame in practice);\n",
    "            `predictSentiment` adds a 'sentiment' column to it in place.\n",
    "        label: name of the column in `data` holding the true labels.\n",
    "        text: name of the column in `data` holding the review text.\n",
    "        classifier: estimator used as the final step of `pipeline`.\n",
    "        pipeline: Pipeline made of a TfidfVectorizer followed by `classifier`.\n",
    "        gridSearch: fitted GridSearchCV from the most recent `tuneClassifier`\n",
    "            call, or None when tuning has not been run.\n",
    "    \"\"\"\n",
    "\n",
    "    ## Adapted from the Scikit learn tutorial, \"Working With Text Data.\" Retrieved from\n",
    "    ## http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html\n",
    "    # Prototype only: every instance deep-copies it, so it is never fitted itself.\n",
    "    # NOTE(review): `n_iter` is the spelling used by older scikit-learn releases;\n",
    "    # confirm the installed version before renaming it.\n",
    "    _classifier = SGDClassifier(loss='hinge',\n",
    "                                penalty='l2',\n",
    "                                n_iter=5\n",
    "                               )\n",
    "\n",
    "    # Calculated from `subset = reviews.sample(frac=.01, random_state=2016)`\n",
    "    # (see 'review_data_classify_tune' module).\n",
    "    # Used by `predictSentiment` when no parameters are supplied.\n",
    "    _optimalParameters = {'vectorizer__ngram_range': (1, 2),\n",
    "                          'vectorizer__use_idf': True,\n",
    "                          'vectorizer__token_pattern': \"[a-z']{2,}\",\n",
    "                          'vectorizer__stop_words': 'english',\n",
    "                          'vectorizer__min_df': 0.01,\n",
    "                          'vectorizer__max_df': 0.99,\n",
    "                          'classifier__alpha': 0.001\n",
    "                         }\n",
    "\n",
    "    def __init__(self, data, label, text, classifier=None):\n",
    "        \"\"\"Store the data and build an unfitted vectorizer + classifier pipeline.\n",
    "\n",
    "        Args:\n",
    "            data: table holding the `text` and `label` columns.\n",
    "            label: name of the label column.\n",
    "            text: name of the text column.\n",
    "            classifier: estimator for the final pipeline step. When None, a\n",
    "                fresh deep copy of the class-level `_classifier` is used.\n",
    "        \"\"\"\n",
    "        self.data = data\n",
    "        self.label = label\n",
    "        self.text = text\n",
    "        # The copy is made here rather than in the signature: a default in the\n",
    "        # signature is evaluated once, so every instance would share (and\n",
    "        # refit) the very same estimator object.\n",
    "        if classifier is None:\n",
    "            classifier = copy.deepcopy(self._classifier)\n",
    "        self.classifier = classifier\n",
    "        self.pipeline = Pipeline([('vectorizer', TfidfVectorizer()),\n",
    "                                  ('classifier', self.classifier)\n",
    "                                 ])\n",
    "        self.gridSearch = None\n",
    "\n",
    "    def tuneClassifier(self, tuningParameters, kfolds):\n",
    "        \"\"\"Grid-search `tuningParameters` over the pipeline with cross-validation.\n",
    "\n",
    "        The fitted search object is kept on `self.gridSearch`. `self.pipeline`\n",
    "        is left as a Pipeline so that `predictSentiment` and repeated tuning\n",
    "        keep working afterwards.\n",
    "\n",
    "        Args:\n",
    "            tuningParameters: parameter grid, passed to GridSearchCV as `param_grid`.\n",
    "            kfolds: cross-validation setting, passed to GridSearchCV as `cv`.\n",
    "\n",
    "        Returns:\n",
    "            DataFrame with one row per entry of `grid_scores_` and the columns\n",
    "            'parameters', 'mean_score' and 'confidence' (twice the standard\n",
    "            deviation of that entry's scores).\n",
    "        \"\"\"\n",
    "        search = GridSearchCV(estimator=self.pipeline,\n",
    "                              param_grid=tuningParameters,\n",
    "                              cv=kfolds,\n",
    "                              n_jobs=-1,\n",
    "                              error_score=numpy.nan\n",
    "                             )\n",
    "        search.fit(self.data[self.text], self.data[self.label])\n",
    "        self.gridSearch = search\n",
    "\n",
    "        # NOTE(review): `grid_scores_` belongs to the older scikit-learn API that\n",
    "        # this notebook imports from `sklearn.grid_search`; confirm the pinned\n",
    "        # version before upgrading.\n",
    "        records = [(params, mean_score, scores.std() * 2)\n",
    "                   for params, mean_score, scores in search.grid_scores_\n",
    "                  ]\n",
    "        return pandas.DataFrame(records,\n",
    "                                columns=['parameters', 'mean_score', 'confidence'])\n",
    "\n",
    "    def predictSentiment(self, modelParameters=None):\n",
    "        \"\"\"Fit the pipeline on the stored data and label those same rows.\n",
    "\n",
    "        Predictions are made on the rows the model was fitted on, so the\n",
    "        evaluation methods of this class report in-sample performance.\n",
    "\n",
    "        Args:\n",
    "            modelParameters: pipeline parameters in `step__name` form. When\n",
    "                None, the class-level `_optimalParameters` are used.\n",
    "\n",
    "        Returns:\n",
    "            `self.data`, with the predictions written to its 'sentiment' column.\n",
    "        \"\"\"\n",
    "        if modelParameters is None:\n",
    "            modelParameters = self._optimalParameters\n",
    "        self.pipeline.set_params(**modelParameters)\n",
    "        self.pipeline.fit(self.data[self.text], self.data[self.label])\n",
    "        self.data['sentiment'] = self.pipeline.predict(self.data[self.text])\n",
    "        return self.data\n",
    "\n",
    "    def accuracy(self):\n",
    "        \"\"\"Return the fraction of rows whose 'sentiment' equals the label.\n",
    "\n",
    "        Requires the 'sentiment' column written by `predictSentiment`.\n",
    "        \"\"\"\n",
    "        return numpy.mean(self.data['sentiment'] == self.data[self.label])\n",
    "\n",
    "    def classificationReport(self):\n",
    "        \"\"\"Return sklearn's classification report of labels versus 'sentiment'.\n",
    "\n",
    "        Requires the 'sentiment' column written by `predictSentiment`.\n",
    "        \"\"\"\n",
    "        return metrics.classification_report(self.data[self.label], self.data['sentiment'])\n",
    "\n",
    "    def confusionMatrix(self):\n",
    "        \"\"\"Return sklearn's confusion matrix of labels versus 'sentiment'.\n",
    "\n",
    "        Requires the 'sentiment' column written by `predictSentiment`.\n",
    "        \"\"\"\n",
    "        return metrics.confusion_matrix(self.data[self.label], self.data['sentiment'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
